In [ ]:
import plotly.tools as tls
import pandas as pd
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display
import plotly.plotly as py # interactive graphing
import plotly.graph_objs as go
from plotly.graph_objs import Bar, Scatter, Marker, Layout, Histogram, Box
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
import psycopg2 as pg
#load python script that batch loads pandas df to sql
import cStringIO
In [ ]:
#py.sign_in('bottydim', 'o1kuyms9zv')
init_notebook_mode()
In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'
db_name = 'c1_agg.db'
db_name = 'ccfd.db'
In [ ]:
# chunk-by-chunk append to sql
df_evnt = pd.read_csv(data_dir+evt_name)
In [ ]:
df_evnt
In [ ]:
df_evnt.isnull().sum()
In [ ]:
df_evnt = df
df_evnt.columns
In [ ]:
len(df_evnt.columns)
In [ ]:
df_evnt['AUTHZN_MSG_TYPE_CD'].unique()
In [ ]:
df.columns
In [ ]:
len(df_evnt.columns)
In [ ]:
# NOTE(review): this line was corrupted in the notebook export; reconstructed
# from the matching date+time merge used in the load loops below -- confirm
# the intended target column.
df['AUTHZN_RQST_PROC_TM'] = df.apply(lambda x: getTime(x), 1)
In [ ]:
!wc -l < {data_dir+evt_name} # Number of lines in dataset
In [ ]:
!wc -l < {data_dir+auth_name}
# 107,565,191  <- line count reported by the wc -l call above
In [ ]:
display(pd.read_csv(data_dir+auth_name, nrows=2).head())
In [ ]:
# SQLite engine over the local db file; convert_unicode asks SQLAlchemy to
# coerce text round-trips.
disk_engine = create_engine('sqlite:///'+data_dir+db_name,convert_unicode=True)
# Force the sqlite3 driver to return raw str (bytes) rather than unicode,
# since the source CSVs are not clean UTF-8.
disk_engine.raw_connection().connection.text_factory = str
# Initializes database with filename in current directory
# Initializes database with filename in current directory
In [ ]:
# engine = create_engine(
# "postgresql+pg8000://botty:botty@localhost/ccfd",
# isolation_level="AUTOCOMMIT"
# )
engine = create_engine(
"postgresql://script@localhost:5432/ccfd",
isolation_level="AUTOCOMMIT",
pool_size=20
)
disk_engine = engine
In [ ]:
table = 'event'
In [ ]:
# DDL for the Postgres `event` table: drop-and-recreate with explicit column
# types mirroring the events CSV (double-quoted identifiers preserve the
# upper-case column names).
# NOTE(review): built with backslash continuations, so the resulting string
# keeps each line's leading whitespace -- harmless to Postgres.
create_qry = ' DROP TABLE IF EXISTS {table}; \
CREATE TABLE {table} \
( \
index bigint, \
acct_id text, \
"AUTHZN_RQST_PROC_TM" timestamp without time zone, \
"AUTHZN_APPRL_CD" text, \
"AUTHZN_AMT" double precision, \
"MRCH_NM" text, \
"MRCH_CITY_NM" text, \
"MRCH_PSTL_CD" double precision, \
"MRCH_CNTRY_CD" text, \
"MRCH_ID" text, \
"TRMNL_ID" double precision, \
"MRCH_CATG_CD" text, \
"POS_ENTRY_MTHD_CD" double precision, \
"POS_COND_CD" double precision, \
"TRMNL_CLASFN_CD" double precision, \
"TRMNL_CAPBLT_CD" double precision, \
"TRMNL_PIN_CAPBLT_CD" double precision, \
"TSYS_DCLN_REAS_CD" double precision, \
"MRCH_TMP_PRTPN_IND" text, \
"AUTHZN_MSG_TYPE_MODR_CD" text, \
"AUTHZN_ACCT_STAT_CD" text, \
"AUTHZN_MSG_TYPE_CD" bigint, \
"AUTHZN_RQST_TYPE_CD" bigint, \
"AUTHZN_RESPNS_CD" bigint, \
"ACCT_STAT_REAS_NUM" bigint, \
"RQST_CARD_SEQ_NUM" text, \
"PIN_OFST_IND" bigint, \
"PIN_VLDTN_IND" text, \
"CARD_VFCN_REJ_CD" double precision, \
"CARD_VFCN_RESPNS_CD" text, \
"CARD_VFCN2_RESPNS_CD" text, \
"CAVV_CD" double precision, \
"ECMRC_SCURT_CD" text, \
"ACQR_BIN_NUM" text, \
"ACQR_CRCY_CD" double precision, \
"CRCY_CNVRSN_RT" bigint, \
"AUTHZN_APPRD_AMT" double precision, \
"PRIOR_MONEY_AVL_AMT" double precision, \
"PRIOR_CASH_AVL_AMT" double precision, \
"ACCT_CL_AMT" double precision, \
"ACCT_CURR_BAL" double precision, \
"PREV_ADR_CHNG_DT" timestamp without time zone, \
"PREV_PMT_DT" timestamp without time zone, \
"PREV_PMT_AMT" double precision, \
"PREV_CARD_RQST_DT" timestamp without time zone, \
"FRD_IND" text, \
"FRD_IND_SWT_DT" timestamp without time zone \
);'.format(table=table)
In [ ]:
# address = 'postgresql://<username>:<pswd>@<host>:<port>/<database>'
address = 'postgresql://script@localhost:5432/ccfd'
engine = create_engine(address)
connection = engine.raw_connection()
cursor = connection.cursor()
In [ ]:
# Create a single-column index on every column except the DataFrame index.
# NOTE(review): loop-body indentation was lost in the notebook export;
# restored here. `col_names` is defined in an earlier session cell -- confirm.
for name in col_names:
    if name == 'index':
        continue
    cursor.execute('''CREATE INDEX id_auth_{col}
                      ON {table} ({col})'''.format(table=table, col=name))
# Single commit after all indexes are created.
connection.commit()
print('idxs created!')
In [ ]:
table = 'auth'
cursor.execute('''CREATE INDEX id_{table}_acct_id_tm
ON {table} (acct_id,AUTHZN_RQST_PROC_TM)'''.format(table=table))
cursor.execute('''CREATE INDEX id_{table}_tm_frd
ON {table} (AUTHZN_RQST_PROC_TM,FRD_IND_SWT_DT)'''.format(table=table))
connection.commit()
In [ ]:
cursor.__dict__
In [ ]:
cursor.execute(create_qry)
connection.commit()
In [ ]:
with engine.connect() as conn:
    # conn.execute('drop TABLE event;')
    # NOTE(review): the only statement was commented out, leaving an empty
    # (invalid) with-block; `pass` keeps the cell syntactically valid.
    pass
In [ ]:
# NOTE(review): this DDL is a placeholder template (col_type / field_length
# are not real types) and will fail if actually executed against Postgres.
# Body indentation restored -- it was stripped by the notebook export.
with engine.connect() as conn:
    conn.execute('CREATE TABLE table_name ( \
        column_name1 col_type (field_length) column_constraints,\
        column_name2 col_type (field_length),\
        column_name3 col_type (field_length));')
In [ ]:
print disk_engine.raw_connection().connection.text_factory
disk_engine.raw_connection().connection.text_factory = lambda x: unicode(x, 'utf-8', 'ignore')
print disk_engine.raw_connection().connection.text_factory
In [ ]:
df_trim = pd.read_sql_query('SELECT * FROM {} LIMIT 10000'.format(table), disk_engine)
In [ ]:
df_trim.head()
In [ ]:
table = 'event'
# df_trim.to_sql(table, disk_engine, if_exists='replace')
In [ ]:
#################### CSV transfer via Postgres COPY (fast bulk load) ####################
# NOTE(review): all loop/def indentation below was lost in the notebook
# export and is restored here; py2 prints rewritten in the parenthesized
# form that is valid in both Python 2 and 3.
start = dt.datetime.now()
chunksize = 300000
j = 0            # chunk counter (progress reporting only)
index_start = 1  # running row index carried across chunks
################### data source
file_loc = data_dir + evt_name
########################
dtFormat = "%d%b%Y %H:%M:%S.%f"


def getTime(x):
    """Combine a row's AUTHZN_RQST_PROC_DT and AUTHZN_RQST_PROC_TM fields
    into a single datetime (format e.g. '01JAN2014 12:30:45.123456')."""
    dtString = "{} {}".format(x.AUTHZN_RQST_PROC_DT, x.AUTHZN_RQST_PROC_TM)
    return dt.datetime.strptime(dtString, dtFormat)


for df in pd.read_csv(file_loc, chunksize=chunksize, iterator=True, encoding='ISO-8859-1'):
    # Remove spaces from column names
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df['acct_id'] = df['acct_id'].astype(str)
    # Merge the separate date + time columns into one timestamp column
    df['AUTHZN_RQST_PROC_TM'] = df.apply(getTime, 1)
    # Approval code: coerce non-numeric values to NaN, then store as text
    df['AUTHZN_APPRL_CD'] = pd.to_numeric(df['AUTHZN_APPRL_CD'], errors='coerce')
    df['AUTHZN_APPRL_CD'] = df['AUTHZN_APPRL_CD'].astype(str)
    # Code-like columns are kept as strings
    df.MRCH_CNTRY_CD = df.MRCH_CNTRY_CD.astype(str)
    df.MRCH_CATG_CD = df.MRCH_CATG_CD.astype(str)
    df.AUTHZN_MSG_TYPE_MODR_CD = df.AUTHZN_MSG_TYPE_MODR_CD.astype(str)
    df.RQST_CARD_SEQ_NUM = df.RQST_CARD_SEQ_NUM.astype(str)
    df.ECMRC_SCURT_CD = df.ECMRC_SCURT_CD.astype(str)
    df.ACQR_BIN_NUM = df.ACQR_BIN_NUM.astype(str)
    # Date-only columns in %d%b%Y (e.g. 01JAN2014); unparseable -> NaT
    df.PREV_ADR_CHNG_DT = pd.to_datetime(df.PREV_ADR_CHNG_DT, errors='coerce', format='%d%b%Y')
    df.PREV_PMT_DT = pd.to_datetime(df.PREV_PMT_DT, errors='coerce', format='%d%b%Y')
    df.PREV_CARD_RQST_DT = pd.to_datetime(df.PREV_CARD_RQST_DT, errors='coerce', format='%d%b%Y')
    df.FRD_IND_SWT_DT = pd.to_datetime(df.FRD_IND_SWT_DT, errors='coerce', format='%d%b%Y')
    df.index += index_start
    # Remove the un-interesting columns
    drop_cols = ['AUTHZN_RQST_PROC_DT', 'EXCSV_ACTVY_PARM_CD']
    df = df.drop([c for c in df.columns if c in drop_cols], axis=1)
    j += 1
    t_mid = dt.datetime.now()
    print('{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j * chunksize))
    # Stream the chunk through an in-memory buffer into Postgres COPY.
    output = cStringIO.StringIO()
    df.to_csv(output, sep='\t', header=False, index=False)  # ignore the index
    output.seek(0)  # jump to start of stream
    cur = connection.cursor()
    cur.copy_from(output, table, null="")  # null values become ''
    connection.commit()
    cur.close()
    print('{} seconds: inserted {} rows'.format((dt.datetime.now() - t_mid).seconds, j * chunksize))
    index_start = df.index[-1] + 1
In [ ]:
# Chunked CSV -> SQL transfer using DataFrame.to_sql (slower than COPY but
# driver-agnostic). NOTE(review): indentation restored after notebook export
# stripped it; py2 prints rewritten in the 2/3-compatible parenthesized form.
start = dt.datetime.now()
chunksize = 300000
j = 0            # chunk counter (progress reporting only)
index_start = 1  # running row index carried across chunks
################### data source
file_loc = data_dir + evt_name
########################
dtFormat = "%d%b%Y %H:%M:%S.%f"


def getTime(x):
    """Combine a row's AUTHZN_RQST_PROC_DT and AUTHZN_RQST_PROC_TM fields
    into a single datetime."""
    dtString = "{} {}".format(x.AUTHZN_RQST_PROC_DT, x.AUTHZN_RQST_PROC_TM)
    return dt.datetime.strptime(dtString, dtFormat)


for df in pd.read_csv(file_loc, chunksize=chunksize, iterator=True, encoding='ISO-8859-1'):
    # Remove spaces from column names
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df['acct_id'] = df['acct_id'].astype(str)
    # Merge the separate date + time columns into one timestamp column
    df['AUTHZN_RQST_PROC_TM'] = df.apply(getTime, 1)
    # Approval code: coerce non-numeric values to NaN, then store as text
    df['AUTHZN_APPRL_CD'] = pd.to_numeric(df['AUTHZN_APPRL_CD'], errors='coerce')
    df['AUTHZN_APPRL_CD'] = df['AUTHZN_APPRL_CD'].astype(str)
    # Code-like columns are kept as strings
    df.MRCH_CNTRY_CD = df.MRCH_CNTRY_CD.astype(str)
    df.MRCH_CATG_CD = df.MRCH_CATG_CD.astype(str)
    df.AUTHZN_MSG_TYPE_MODR_CD = df.AUTHZN_MSG_TYPE_MODR_CD.astype(str)
    df.RQST_CARD_SEQ_NUM = df.RQST_CARD_SEQ_NUM.astype(str)
    df.ECMRC_SCURT_CD = df.ECMRC_SCURT_CD.astype(str)
    df.ACQR_BIN_NUM = df.ACQR_BIN_NUM.astype(str)
    # Date-only columns in %d%b%Y; unparseable -> NaT
    df.PREV_ADR_CHNG_DT = pd.to_datetime(df.PREV_ADR_CHNG_DT, errors='coerce', format='%d%b%Y')
    df.PREV_PMT_DT = pd.to_datetime(df.PREV_PMT_DT, errors='coerce', format='%d%b%Y')
    df.PREV_CARD_RQST_DT = pd.to_datetime(df.PREV_CARD_RQST_DT, errors='coerce', format='%d%b%Y')
    df.FRD_IND_SWT_DT = pd.to_datetime(df.FRD_IND_SWT_DT, errors='coerce', format='%d%b%Y')
    df.index += index_start
    # Remove the un-interesting columns
    drop_cols = ['AUTHZN_RQST_PROC_DT', 'EXCSV_ACTVY_PARM_CD']
    df = df.drop([c for c in df.columns if c in drop_cols], axis=1)
    j += 1
    t_mid = dt.datetime.now()
    print('{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j * chunksize))
    df.to_sql(table, disk_engine, if_exists='append')
    print('{} seconds: inserted {} rows'.format((dt.datetime.now() - t_mid).seconds, j * chunksize))
    index_start = df.index[-1] + 1
In [ ]:
import sqlite3
def init_sqlite3(dbname):
    """Open the SQLite database at *dbname* (creating it if absent) and
    return the raw connection object."""
    return sqlite3.connect(dbname)
disk_engine = init_sqlite3(data_dir+db_name)
In [ ]:
drop_qry = '''DROP TABLE IF EXISTS {table}; '''
table = 'auth'
cursor.execute(drop_qry.format(table=table))
connection.commit()
In [ ]:
# Chunked load of the (much larger) auths CSV into the `auth` table.
# NOTE(review): indentation restored after notebook export stripped it;
# py2 prints rewritten in the 2/3-compatible parenthesized form.
table = 'auth'
start = dt.datetime.now()
chunksize = 300000
j = 0            # chunk counter (progress reporting only)
index_start = 1  # running row index carried across chunks
file_loc = data_dir + auth_name
dtFormat = "%d%b%Y %H:%M:%S.%f"


def getTime(x):
    """Combine a row's AUTHZN_RQST_PROC_DT and AUTHZN_RQST_PROC_TM fields
    into a single datetime."""
    dtString = "{} {}".format(x.AUTHZN_RQST_PROC_DT, x.AUTHZN_RQST_PROC_TM)
    return dt.datetime.strptime(dtString, dtFormat)


for df in pd.read_csv(file_loc, chunksize=chunksize, iterator=True, encoding='ISO-8859-1'):
    # Remove spaces from column names
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    # Merge the separate date + time columns into one timestamp column
    df['AUTHZN_RQST_PROC_TM'] = df.apply(getTime, 1)
    # Approval code: coerce non-numeric values to NaN, then store as text
    df['AUTHZN_APPRL_CD'] = pd.to_numeric(df['AUTHZN_APPRL_CD'], errors='coerce')
    df['AUTHZN_APPRL_CD'] = df['AUTHZN_APPRL_CD'].astype(str)
    # Code-like columns are kept as strings
    df.MRCH_CNTRY_CD = df.MRCH_CNTRY_CD.astype(str)
    df.MRCH_CATG_CD = df.MRCH_CATG_CD.astype(str)
    df.AUTHZN_MSG_TYPE_MODR_CD = df.AUTHZN_MSG_TYPE_MODR_CD.astype(str)
    df.RQST_CARD_SEQ_NUM = df.RQST_CARD_SEQ_NUM.astype(str)
    df.ECMRC_SCURT_CD = df.ECMRC_SCURT_CD.astype(str)
    df.ACQR_BIN_NUM = df.ACQR_BIN_NUM.astype(str)
    # Date-only columns in %d%b%Y; unparseable -> NaT
    df.PREV_ADR_CHNG_DT = pd.to_datetime(df.PREV_ADR_CHNG_DT, errors='coerce', format='%d%b%Y')
    df.PREV_PMT_DT = pd.to_datetime(df.PREV_PMT_DT, errors='coerce', format='%d%b%Y')
    df.PREV_CARD_RQST_DT = pd.to_datetime(df.PREV_CARD_RQST_DT, errors='coerce', format='%d%b%Y')
    df.FRD_IND_SWT_DT = pd.to_datetime(df.FRD_IND_SWT_DT, errors='coerce', format='%d%b%Y')
    df.index += index_start
    # Remove the un-interesting columns
    drop_cols = ['AUTHZN_RQST_PROC_DT', 'EXCSV_ACTVY_PARM_CD']
    df = df.drop([c for c in df.columns if c in drop_cols], axis=1)
    j += 1
    print('{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j * chunksize))
    df.to_sql(table, disk_engine, if_exists='append')
    index_start = df.index[-1] + 1
In [ ]:
In [ ]:
In [ ]:
table = 'auth'
In [ ]:
df_total = pd.read_sql_query('SELECT count(*) FROM {}'.format(table), disk_engine)
df_total.head()
In [ ]:
df_total_F = pd.read_sql_query('SELECT count(*) FROM {} where FRD_IND = "Y"'.format(table), disk_engine)
df_total_F.head()
In [ ]:
df_total_F_G = pd.read_sql_query('SELECT FRD_IND, count(*) as tr_num FROM {} group by FRD_IND'.format(table), disk_engine)
df_total_F_G.head()
In [ ]:
title = 'Fraud Distribution'
fig = {
'data': [{'labels': ['Fraud', 'Genuine'],
'values': [df_total_F_G['tr_num'][1], df_total_F_G['tr_num'][0]],
'type': 'pie'}],
'layout': {'title': title}
}
iplot(fig,filename='figures/'+title)
In [ ]:
df = pd.read_sql_query('SELECT * FROM {} LIMIT 3'.format(table), disk_engine)
df.head()
In [ ]:
col = 'acct_id'
df_dst_u = pd.read_sql_query('SELECT COUNT(DISTINCT {}) FROM {}'.format(col,table), disk_engine)
df_dst_u
In [ ]:
df_dst_u
# SQL scratch note: select FRD_IND, count(*) from data group by FRD_IND;
In [ ]:
col = 'FRD_IND'
# NOTE(review): the original mixed '{0}' with '{}' in one format string (a
# ValueError: cannot switch between manual and automatic field numbering),
# omitted the spaces between the concatenated SQL fragments, and ordered by
# `num_trans` while the alias was `num_fraud`. All three fixed below.
qry = ('SELECT {0}, COUNT(*) as `num_fraud` '
       'FROM {1} '
       'GROUP BY {0} '
       'ORDER BY `num_fraud` DESC').format(col, table)
df = pd.read_sql_query(qry, disk_engine)
df.head()
In [ ]:
df_infr = pd.read_sql_query('select distinct FRD_IND,count(distinct acct_id) as num_usr '
'from {table} '
'group by FRD_IND'.format(table=table), disk_engine)
df_infr
In [ ]:
title = 'Fraud by Distinct Users'
fig = {
'data': [{'labels': ['Fraud', 'Genuine'],
'values': [df_infr['num_usr'][1], df_infr['num_usr'][0]],
'type': 'pie'}],
'layout': {'title': title}
}
iplot(fig,filename='figures/'+title)
In [ ]:
usr_ratio = df_infr['num_usr'][0]/ df_infr['num_usr'][1]
usr_ratio
# SQL scratch note: select acct_id, count(*) as num_trans from data group by acct_id order by num_trans DESC;
In [ ]:
col = '*'
df = pd.read_sql_query('SELECT {0} '
'FROM {1} '
'where acct_id = "."'.format(col,table), disk_engine)
df.head(8)
In [ ]:
col = 'acct_id'
df_u_t = pd.read_sql_query('SELECT {0}, COUNT(*) as `num_trans` '
'FROM {1} '
'GROUP BY {0} '
'ORDER BY `num_trans` DESC'.format(col,table), disk_engine)
df_u_t.head(8)
In [ ]:
title = 'Transactions Histogram'
data = [
Histogram(
x=df_u_t['num_trans']
)
]
iplot(data,filename='figures/'+title)
In [ ]:
In [ ]:
df_u_t['num_trans'].median()
In [ ]:
df_u_t['num_trans'].mode()
In [ ]:
df_u_t['num_trans'].describe()
In [ ]:
df_u_t.boxplot(['num_trans'])
In [ ]:
trace0 = Box(
y=df_u_t['num_trans']
)
data = [trace0]
py.iplot(data)
In [ ]:
df_u_t.groupby('num_trans').count()
# SQL scratch note: select acct_id, count(*) as num_fraud from data where FRD_IND = "Y" group by acct_id order by num_fraud DESC;
In [ ]:
col = 'acct_id'
df_u_ft = pd.read_sql_query('select {0}, count(*) as num_fraud '
'from {1} where FRD_IND = "Y" '
'group by {0} order by num_fraud DESC'.format(col,table), disk_engine)
df_u_ft.head()
In [ ]:
col = 'FRD_IND_SWT_DT'
count_nm = 'num_dates'
df_l_t = pd.read_sql_query('select {0}, count(*) as {2} '
'from {1} where FRD_IND = "Y" '
'group by {0} order by {2} DESC'.format(col,table,count_nm), disk_engine)
df_l_t.head()
In [ ]:
df_l_t['num_dates'].describe()
In [ ]:
col = 'FRD_IND_SWT_DT'
count_nm = 'num_dates'
df_l_t_all = pd.read_sql_query('select {0} '
'from {1} where FRD_IND = "Y" '
'order by {0} ASC'.format(col,table,count_nm), disk_engine)
df_l_t_all.head()
In [ ]:
title = 'Dates Histogram'
data = [
Histogram(
x=df_l_t_all['FRD_IND_SWT_DT']
)
]
iplot(data,filename='figures/'+title)
In [ ]:
df_l_t_all['FRD_IND_SWT_DT'].describe()
In [ ]:
df_l_t_all[pd.to_datetime(df_l_t_all['FRD_IND_SWT_DT'])>pd.to_datetime('2014-05-01')].shape
In [ ]:
frac = df_l_t_all[pd.to_datetime(df_l_t_all['FRD_IND_SWT_DT'])>pd.to_datetime('2014-05-01')].count()[0]
In [ ]:
total_dates = df_l_t_all.count()[0]
In [ ]:
frac*1.0/total_dates
In [ ]:
title = 'Confiramtions per Date'
trace0 = Bar(
x= df_l_t['FRD_IND_SWT_DT'],
y=df_l_t['num_dates'],
# text=['27% market share', '24% market share', '19% market share'],
# marker=dict(
# color='rgb(158,202,225)',
# line=dict(
# color='rgb(8,48,107)',
# width=1.5,
# )
# ),
# opacity=0.6
)
data = [trace0]
layout = go.Layout(
title=title,
)
fig = go.Figure(data=data, layout=layout)
iplot(data,filename='figures/'+title)
In [ ]:
col = 'AUTHZN_RQST_PROC_TM'
count_nm = 'num_dates'
df_t_time = pd.read_sql_query('select {0} '
'from {1} where FRD_IND = "Y" '
'order by {0} ASC'.format(col,table,count_nm), disk_engine)
df_t_time.head()
In [ ]:
table = 'auth'
subset_df = pd.read_sql_query('select * '
'from {table} limit 10000'.format(table=table), disk_engine)
In [ ]:
pclass_xt = pd.crosstab(subset_df['acct_id'], subset_df['FRD_IND'])
display(pclass_xt)
In [ ]:
pclass_xt.drop(pclass_xt.index[[np.arange(10,200)]],inplace=True)
In [ ]:
pclass_xt
In [ ]:
pclass_xt_pct = pclass_xt.div(pclass_xt.sum(1).astype(float), axis=0)
pclass_xt_pct
pclass_xt_pct.plot(kind='bar',
stacked=True,
title='Distribution of transactions per user')
plt.xlabel('Genuine')
plt.ylabel('Fraud')
In [ ]:
# SQL scratch note: select acct_id, AUTHZN_RQST_PROC_TM, FRD_IND from data where acct_id = 337018623;
In [ ]:
# SQL scratch note: select FRD_IND, count(distinct acct_id) from data group by FRD_IND;
In [ ]:
def encode_column(df_col):
    """Label-encode a categorical column.

    NOTE(review): the original definition had no body (a SyntaxError).
    Based on its use below (``encoders[r] = encode_column(df[r])`` for every
    object-dtype column), it is implemented as a value -> integer-code map,
    codes assigned in order of first appearance.

    Parameters:
        df_col: pandas Series (or array-like) of categorical values.
    Returns:
        dict mapping each distinct value to its integer code.
    """
    uniques = pd.Series(df_col).unique()
    return {val: code for code, val in enumerate(uniques)}
In [ ]:
# Build an encoder for every object-dtype (string) column of df.
# NOTE(review): loop indentation was lost in the notebook export; restored.
encoders = {}
for c, r in enumerate(df):  # r is the column name, c its position
    if df.dtypes[c] == 'object':
        encoders[r] = encode_column(df[r])
encoders
In [ ]:
df_ds_u = pd.read_sql_query('select distinct acct_id, FRD_IND '
'from {table} '
'order by FRD_IND'.format(table=table), disk_engine)
df_ds_u
In [ ]:
# Interleaved sampling of account ids: take `usr_ratio` users from the head
# of df_ds_u (ordered by FRD_IND) for every one taken from the tail, so the
# sample preserves the genuine/fraud user ratio.
# NOTE(review): loop indentation was lost in the notebook export; restored.
users = set()
cnt = 0
head = 0
tail = len(df_ds_u.acct_id) - 1
sample_size = tail
for i in range(sample_size):
    if cnt < usr_ratio:
        users.add(df_ds_u.acct_id[head])
        cnt += 1
        head += 1
    else:
        users.add(df_ds_u.acct_id[tail])
        tail -= 1
        cnt = 0
In [ ]:
def generate_sequence(user, table):
    """Return the transaction sequence for *user* drawn from *table*.

    NOTE(review): the original body was truncated mid-statement (a dangling
    ``for`` -- a SyntaxError). This reconstruction returns the user's rows in
    order as tuples; confirm the intended per-row feature encoding.
    """
    df = get_user_info(user, table)
    return [row for row in df.itertuples(index=False)]
In [ ]:
# NOTE(review): was `sequnences = []` (typo) while the loop appended to
# `sequences`, which would raise NameError on the first iteration.
sequences = []
for user in users:
    sequences.append(generate_sequence(user, table))
In [ ]:
for r in df.dtypes:
print r
In [ ]:
import keras
In [ ]:
(1e-3)*10
In [ ]:
gs_results_dir = 'gs_results_little.csv'
df_GS = pd.read_csv(data_dir+gs_results_dir,header=None)
In [ ]:
df_GS.sort_values(7,ascending=False)
In [ ]:
# NOTE(review): help() takes only the object to document; the original passed
# ascending=True as a keyword, which raises TypeError.
help(df_GS.sort_values)
In [ ]:
gs_results_dir = 'gs_results_trim.csv'
df_GS_trim = pd.read_csv(data_dir+gs_results_dir,header=None)
df_GS_trim.sort_values(8)
In [ ]:
table = 'data_more'
dataFrame = pd.read_sql_query('select acct_id, count(*) as num_trans '
'from {table} '
''
'group by acct_id '
'order by num_trans'
.format(table=table), disk_engine)
dataFrame
In [ ]:
encoders
In [ ]: